Initial Data Analysis (IDA)
Creating Data Frames for each Suburb
#' Scrape sold townhouse listings for one suburb from auhouseprices.com
#'
#' Adapted from https://embracingtherandom.com/r/web-scraping/rent-scraping/
#'
#' @param location Path fragment of the form "<postcode>/<suburb>/" that is
#'   inserted into the listing URL.
#' @return A data frame with one row per sold listing that shows a price
#'   (columns `address`, `bedroom`, `bathroom`, `carspace`, `soldprice`,
#'   `yearsold`; `yearsold` is a Date), or `NULL` when no page yields such a
#'   listing.
house_scraping <- function(location = "2151/Parramatta/") {
  base_url <- "https://www.auhouseprices.com/sold/list/NSW/"
  # type set to townhouse, no other filtering
  query <- "/?type=townhouse&ymin=0&ymax=0&bmin=0&bmax=0&pmin=0&pmax=0&sort=date&kw="
  page_url <- function(page) paste0(base_url, location, page, query)

  # Index of the first element of `x` matching `pattern`; fails with a clear
  # message when the expected page marker is missing.
  first_match <- function(pattern, x, what) {
    hits <- grep(pattern, x)
    if (length(hits) == 0) {
      stop("Could not find ", what, " on the page for '", location, "'",
           call. = FALSE)
    }
    hits[1]
  }

  # "day month year" and "month year" strings are parsed with lubridate; any
  # other shape is treated as a bare year and dated to 1 January.
  parse_sold_date <- function(x) {
    n_parts <- length(strsplit(x, "\\s")[[1]])
    if (n_parts == 3) {
      dmy(x)
    } else if (n_parts == 2) {
      my(x)
    } else {
      as.Date(paste0(x, "-01-01"))
    }
  }

  # Determine how many pages to scroll through: the first <h2> carries the
  # counts; its 2nd number is read as "shown per page", its 3rd as "total".
  webpage <- read_html(page_url(1))
  heading <- (webpage %>% html_nodes("h2") %>% html_text())[1]
  counts <- as.numeric(regmatches(heading, gregexpr("[0-9]+", heading))[[1]])
  if (length(counts) < 3 || anyNA(counts[2:3]) || counts[2] <= 0) {
    stop("Could not read the listing counts for '", location, "'",
         call. = FALSE)
  }
  end_page <- ceiling(counts[3] / counts[2])

  pages <- vector("list", end_page)
  for (thispage in seq_len(end_page)) {
    if (thispage %% 5 == 0) {
      print(paste0("Processing page ", thispage))
    }

    # get website text
    webpage <- read_html(page_url(thispage))
    result <- webpage %>% html_nodes("li") %>% html_text()
    # the relevant content ends at the first entry mentioning "current"
    result <- result[seq_len(first_match("current", result,
                                         "the 'current' end marker"))]
    # remove the redundant "listed price" and the prices listed with rent
    result <- result[!grepl("List", result)]
    result <- result[!grepl("Rent", result)]

    # only entries that show a price are kept; a page without any would give
    # a frame that cannot be row-bound to the others, so skip it
    price_rows <- grep("\\$", result)
    if (length(price_rows) == 0) {
      next
    }

    # text before the "$" holds bedroom/bathroom/carspace counts, text after
    # it holds the price
    split_rows <- strsplit(result[price_rows], "\\$")
    features <- strsplit(trimws(vapply(split_rows, `[`, character(1), 1)),
                         "\\s+")
    feature <- function(i) as.numeric(vapply(features, `[`, character(1), i))
    price <- as.numeric(gsub(",", "",
                             trimws(vapply(split_rows, `[`, character(1), 2))))

    # the entry immediately before a price entry is its "Sold on ..." line
    sold_text <- trimws(gsub("Sold on", "", result[price_rows - 1]))
    timesold <- do.call("c", lapply(sold_text, parse_sold_date))

    # addresses are the <h4> entries before the "Auction History" heading
    address <- webpage %>% html_nodes("h4") %>% html_text()
    address <- address[seq_len(first_match("Auction History", address,
                                           "the 'Auction History' heading") - 1)]
    # a sold entry has a price only if the next entry is a price entry; keep
    # the addresses of those properties only
    sold_rows <- grep("Sold on", result)
    address <- address[sold_rows %in% (price_rows - 1)]

    pages[[thispage]] <- data.frame(address = address,
                                    bedroom = feature(1),
                                    bathroom = feature(2),
                                    carspace = feature(3),
                                    soldprice = price,
                                    yearsold = timesold)
  }

  # bind once at the end instead of growing the data frame inside the loop
  do.call(rbind, Filter(Negate(is.null), pages))
}
# Scrape the sold-townhouse listings for each suburb of interest; the
# location string is "<postcode>/<suburb>/".
# Suburb names that contain a space need to be joined with a "+" sign.
# The "## [1] ..." lines are the progress messages recorded from each run.
df_parramatta <- house_scraping( location = "2150/parramatta/")
## [1] "Processing page 5"
## [1] "Processing page 10"
## [1] "Processing page 15"
## [1] "Processing page 20"
## [1] "Processing page 25"
## [1] "Processing page 30"
## [1] "Processing page 35"
df_merrylands <- house_scraping( location = "2160/merrylands/")
## [1] "Processing page 5"
## [1] "Processing page 10"
## [1] "Processing page 15"
## [1] "Processing page 20"
## [1] "Processing page 25"
## [1] "Processing page 30"
## [1] "Processing page 35"
## [1] "Processing page 40"
df_auburn <- house_scraping( location = "2144/auburn/")
## [1] "Processing page 5"
## [1] "Processing page 10"
## [1] "Processing page 15"
## [1] "Processing page 20"
## [1] "Processing page 25"
## [1] "Processing page 30"
## [1] "Processing page 35"
## [1] "Processing page 40"
## [1] "Processing page 45"
## [1] "Processing page 50"
## [1] "Processing page 55"
## [1] "Processing page 60"
df_eastwood <- house_scraping( location = "2122/eastwood/")
## [1] "Processing page 5"
## [1] "Processing page 10"
## [1] "Processing page 15"
df_granville <- house_scraping( location = "2142/granville/")
## [1] "Processing page 5"
## [1] "Processing page 10"
## [1] "Processing page 15"
## [1] "Processing page 20"
## [1] "Processing page 25"
Writing longitude and latitude into dataframe
# Geocode the `address` column of each suburb's table with the ArcGIS method;
# the coordinates are written to new `latitude` and `longitude` columns.
# NOTE(review): geocode() is presumably tidygeocoder::geocode -- confirm
# against the file's library() calls.
# The "## ..." lines are the console messages recorded from each run.
l_parramatta <- df_parramatta%>% geocode(address, method = 'arcgis', lat=latitude, long=longitude)
## Passing 354 addresses to the ArcGIS single address geocoder
## Query completed in: 194.1 seconds
l_merrylands <- df_merrylands%>% geocode(address, method = 'arcgis', lat=latitude, long=longitude)
## Passing 407 addresses to the ArcGIS single address geocoder
## Query completed in: 229.6 seconds
l_auburn <- df_auburn%>% geocode(address, method = 'arcgis', lat=latitude, long=longitude)
## Passing 577 addresses to the ArcGIS single address geocoder
## Query completed in: 321.1 seconds
l_eastwood <- df_eastwood%>% geocode(address, method = 'arcgis', lat=latitude, long=longitude)
## Passing 183 addresses to the ArcGIS single address geocoder
## Query completed in: 109.3 seconds
l_granville <- df_granville%>% geocode(address, method = 'arcgis', lat=latitude, long=longitude)
## Passing 280 addresses to the ArcGIS single address geocoder
## Query completed in: 150.1 seconds
Function to calculate distance to train station
#' Distance between a point and a fixed location
#'
#' Passes both points to `distHaversine()` in (longitude, latitude) order and
#' divides the result by 1000. Substitute `fixed_lat` / `fixed_lon` with the
#' coordinates of the desired place (a train station, etc.) to add a distance
#' column to the data.
#'
#' @param lat,lon Latitude and longitude of the point, e.g. taken from the
#'   latitude and longitude columns of a data frame row.
#' @param fixed_lat,fixed_lon Latitude and longitude of the fixed location.
#' @return The value of `distHaversine()` divided by 1000 (used as kilometres
#'   by the callers in this file).
data_distance_between <- function(lat, lon, fixed_lat, fixed_lon) {
  point <- c(lon, lat)
  fixed_point <- c(fixed_lon, fixed_lat)
  distHaversine(point, fixed_point) / 1000
}
# Station coordinates (latitude / longitude) were taken from Google Maps.
#
# For every suburb, apply() walks over the rows of the two-column subset
# c("latitude", "longitude"), so inside the anonymous function coords[1] is
# the row's latitude and coords[2] its longitude. Each row is passed to
# data_distance_between() together with the suburb's station coordinates.
# data.frame() rewrites the quoted column name into a syntactic one, so the
# new column is accessed as `distance_to_train_station.km.` afterwards.
parramatta_lat <- -33.8175
parramatta_lon <- 151.0050
l_parramatta_distance <- data.frame(
  l_parramatta,
  "distance_to_train_station(km)" = apply(
    l_parramatta[, c("latitude", "longitude")], 1,
    function(coords) data_distance_between(coords[1], coords[2], parramatta_lat, parramatta_lon)
  )
)

merrylands_lat <- -33.8363
merrylands_lon <- 150.9926
l_merrylands_distance <- data.frame(
  l_merrylands,
  "distance_to_train_station(km)" = apply(
    l_merrylands[, c("latitude", "longitude")], 1,
    function(coords) data_distance_between(coords[1], coords[2], merrylands_lat, merrylands_lon)
  )
)

auburn_lat <- -33.8490
auburn_lon <- 151.0329
l_auburn_distance <- data.frame(
  l_auburn,
  "distance_to_train_station(km)" = apply(
    l_auburn[, c("latitude", "longitude")], 1,
    function(coords) data_distance_between(coords[1], coords[2], auburn_lat, auburn_lon)
  )
)

eastwood_lat <- -33.7899
eastwood_lon <- 151.0821
l_eastwood_distance <- data.frame(
  l_eastwood,
  "distance_to_train_station(km)" = apply(
    l_eastwood[, c("latitude", "longitude")], 1,
    function(coords) data_distance_between(coords[1], coords[2], eastwood_lat, eastwood_lon)
  )
)

granville_lat <- -33.8326
granville_lon <- 151.0120
l_granville_distance <- data.frame(
  l_granville,
  "distance_to_train_station(km)" = apply(
    l_granville[, c("latitude", "longitude")], 1,
    function(coords) data_distance_between(coords[1], coords[2], granville_lat, granville_lon)
  )
)
Classifying distance
# Bin the distance to the station into 250 m classes covering 0 to 4 km.
# The breaks are generated with seq() so that no boundary can be left out:
# the hand-typed vector skipped 2.75, which merged two classes into "(2.5,3]"
# and left the "(2.5,2.75]" and "(2.75,3]" classes without any rows.
# cut() labels the classes "(0,0.25]", "(0.25,0.5]", ..., "(3.75,4]".
distance_breaks <- seq(0, 4, by = 0.25)
l_parramatta_distance$distance_class <- cut(l_parramatta_distance$"distance_to_train_station.km.", breaks = distance_breaks)
l_merrylands_distance$distance_class <- cut(l_merrylands_distance$"distance_to_train_station.km.", breaks = distance_breaks)
l_auburn_distance$distance_class <- cut(l_auburn_distance$"distance_to_train_station.km.", breaks = distance_breaks)
l_eastwood_distance$distance_class <- cut(l_eastwood_distance$"distance_to_train_station.km.", breaks = distance_breaks)
l_granville_distance$distance_class <- cut(l_granville_distance$"distance_to_train_station.km.", breaks = distance_breaks)
Combining Data
# Stack the five per-suburb tables into a single data frame.
combined_df <- rbind(
  l_parramatta_distance,
  l_merrylands_distance,
  l_auburn_distance,
  l_eastwood_distance,
  l_granville_distance
)
Filtering Data
# One subset of the combined data per bedroom count.
combined_df_1bed <- combined_df %>% filter(bedroom == 1)
combined_df_2bed <- combined_df %>% filter(bedroom == 2)
combined_df_3bed <- combined_df %>% filter(bedroom == 3)
combined_df_4bed <- combined_df %>% filter(bedroom == 4)
combined_df_5bed <- combined_df %>% filter(bedroom == 5)
# NOTE(review): par() configures base graphics only; it does not arrange the
# ggplot2 figures below side by side.
par(mfrow = c(1, 2))

# Boxplot of sold price (in units of $100,000) per distance class.
# `subset_label` completes the plot title (e.g. "1 Bedroom"); with
# `fill_by_carspace = TRUE` the boxes are split and coloured by carspace count.
plot_price_by_distance <- function(data, subset_label, fill_by_carspace = FALSE) {
  boxes <- if (fill_by_carspace) {
    geom_boxplot(outlier.colour = "blue", outlier.size = 1.5, aes(fill = factor(carspace)))
  } else {
    geom_boxplot(outlier.colour = "blue", outlier.size = 1.5)
  }
  ggplot(data, aes(x = distance_class, y = soldprice / 100000)) +
    boxes +
    labs(
      title = paste0("Sold Price vs Distance from Train Station for ", subset_label),
      x = "Distance from Train Station(km)",
      y = "Selling Price (x$100000)",
      fill = "Number of Carspaces"
    ) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    theme(plot.title = element_text(hjust = 0.25))
}

plot_price_by_distance(combined_df_1bed, "1 Bedroom")

plot_price_by_distance(combined_df_1bed, "1 Bedroom", fill_by_carspace = TRUE)

plot_price_by_distance(combined_df_2bed, "2 Bedrooms")

summary(combined_df_2bed$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 185000 345000 422000 454197 535750 1470000
plot_price_by_distance(combined_df_2bed, "2 Bedrooms", fill_by_carspace = TRUE)

summary(combined_df_2bed$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 185000 345000 422000 454197 535750 1470000
plot_price_by_distance(combined_df_3bed, "3 Bedrooms")

summary(combined_df_3bed$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 575 410000 536000 574283 685000 2020000
plot_price_by_distance(combined_df_3bed, "3 Bedrooms", fill_by_carspace = TRUE)

summary(combined_df_3bed$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 575 410000 536000 574283 685000 2020000
plot_price_by_distance(combined_df_4bed, "4 Bedrooms")

summary(combined_df_4bed$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100000 525000 672500 704376 840000 3000000
plot_price_by_distance(combined_df_4bed, "4 Bedrooms", fill_by_carspace = TRUE)

summary(combined_df_4bed$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100000 525000 672500 704376 840000 3000000
plot_price_by_distance(combined_df_5bed, "5 Bedrooms")

summary(combined_df_5bed$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 595000 769004 1010000 1175092 1514000 2090000
plot_price_by_distance(combined_df_5bed, "5 Bedrooms", fill_by_carspace = TRUE)

summary(combined_df_5bed$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 595000 769004 1010000 1175092 1514000 2090000
Filtering Data by Carspaces and Bedrooms
# One subset of the combined data per bedroom / carspace combination that is
# plotted below.
combined_df_1bed_1car <- filter(combined_df, bedroom == 1, carspace == 1)
combined_df_2bed_1car <- filter(combined_df, bedroom == 2, carspace == 1)
combined_df_2bed_2car <- filter(combined_df, bedroom == 2, carspace == 2)
combined_df_3bed_1car <- filter(combined_df, bedroom == 3, carspace == 1)
combined_df_3bed_2car <- filter(combined_df, bedroom == 3, carspace == 2)
combined_df_3bed_3car <- filter(combined_df, bedroom == 3, carspace == 3)
combined_df_3bed_4car <- filter(combined_df, bedroom == 3, carspace == 4)
combined_df_4bed_1car <- filter(combined_df, bedroom == 4, carspace == 1)
combined_df_4bed_2car <- filter(combined_df, bedroom == 4, carspace == 2)
combined_df_4bed_3car <- filter(combined_df, bedroom == 4, carspace == 3)
combined_df_4bed_4car <- filter(combined_df, bedroom == 4, carspace == 4)
combined_df_5bed_1car <- filter(combined_df, bedroom == 5, carspace == 1)
combined_df_5bed_2car <- filter(combined_df, bedroom == 5, carspace == 2)
combined_df_5bed_3car <- filter(combined_df, bedroom == 5, carspace == 3)

# Boxplot of sold price (in units of $100,000) per distance class for one
# subset; `subset_label` completes the plot title, e.g.
# "1 Bedroom and 1 Carspace". Replaces fourteen copy-pasted ggplot() calls.
plot_bed_car_price_by_distance <- function(data, subset_label) {
  ggplot(data, aes(x = distance_class, y = soldprice / 100000)) +
    geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
    labs(
      title = paste0("Sold Price vs Distance from Train Station for ", subset_label),
      x = "Distance from Train Station(km)",
      y = "Selling Price (x$100000)"
    ) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}
1 bedroom
plot_bed_car_price_by_distance(combined_df_1bed_1car, "1 Bedroom and 1 Carspace")

summary(combined_df_1bed_1car$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 243000 334750 426500 426500 518250 610000
2 bedrooms
plot_bed_car_price_by_distance(combined_df_2bed_1car, "2 Bedrooms and 1 Carspace")

summary(combined_df_2bed_1car$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 230000 343875 419500 455068 530000 1470000
plot_bed_car_price_by_distance(combined_df_2bed_2car, "2 Bedrooms and 2 Carspaces")

summary(combined_df_2bed_2car$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 225000 365000 460200 483116 585500 1120000
3 bedrooms
plot_bed_car_price_by_distance(combined_df_3bed_1car, "3 Bedrooms and 1 Carspace")

summary(combined_df_3bed_1car$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 73000 393000 515000 541211 645375 2020000
plot_bed_car_price_by_distance(combined_df_3bed_2car, "3 Bedrooms and 2 Carspaces")

summary(combined_df_3bed_2car$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 575 425750 595000 604433 700000 1950000
plot_bed_car_price_by_distance(combined_df_3bed_3car, "3 Bedrooms and 3 Carspaces")

# Summarise the 3-carspace subset that was just plotted (the call previously
# repeated the 2-carspace subset). NOTE: the recorded output below still shows
# the 2-carspace figures and must be regenerated.
summary(combined_df_3bed_3car$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 575 425750 595000 604433 700000 1950000
plot_bed_car_price_by_distance(combined_df_3bed_4car, "3 Bedrooms and 4 Carspaces")

summary(combined_df_3bed_4car$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 429500 471750 488000 547062 575000 770000
4 bedrooms
plot_bed_car_price_by_distance(combined_df_4bed_1car, "4 Bedrooms and 1 Carspace")

summary(combined_df_4bed_1car$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 320000 440000 565000 613430 701500 1625000
plot_bed_car_price_by_distance(combined_df_4bed_2car, "4 Bedrooms and 2 Carspaces")

summary(combined_df_4bed_2car$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 160000 550000 710000 721376 851000 1950000
plot_bed_car_price_by_distance(combined_df_4bed_3car, "4 Bedrooms and 3 Carspaces")

summary(combined_df_4bed_3car$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100000 606000 781500 904300 832250 3000000
plot_bed_car_price_by_distance(combined_df_4bed_4car, "4 Bedrooms and 4 Carspaces")

summary(combined_df_4bed_4car$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 456000 580250 595000 629000 691250 800000
5 bedrooms
plot_bed_car_price_by_distance(combined_df_5bed_1car, "5 Bedrooms and 1 Carspace")

summary(combined_df_5bed_1car$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 615000 615000 615000 615000 615000 615000
plot_bed_car_price_by_distance(combined_df_5bed_2car, "5 Bedrooms and 2 Carspaces")

summary(combined_df_5bed_2car$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 595000 842500 1165000 1231101 1571000 2090000
plot_bed_car_price_by_distance(combined_df_5bed_3car, "5 Bedrooms and 3 Carspaces")

summary(combined_df_5bed_3car$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
##
Creating a column for Year
# Year of sale as a factor, taken from the sale date.
combined_df$Year <- as.factor(format(as.Date(combined_df$yearsold), "%Y"))
# Filtering by distance class: one subset per 250 m band. The strings must
# match the level labels that cut() produced for `distance_class` exactly;
# a label that is not among the levels yields an empty subset.
combined_df_0.00 <- filter(combined_df, distance_class == "(0,0.25]")
combined_df_0.25 <- filter(combined_df, distance_class == "(0.25,0.5]")
combined_df_0.50 <- filter(combined_df, distance_class == "(0.5,0.75]")
combined_df_0.75 <- filter(combined_df, distance_class == "(0.75,1]")
combined_df_1.00 <- filter(combined_df, distance_class == "(1,1.25]")
combined_df_1.25 <- filter(combined_df, distance_class == "(1.25,1.5]")
combined_df_1.50 <- filter(combined_df, distance_class == "(1.5,1.75]")
combined_df_1.75 <- filter(combined_df, distance_class == "(1.75,2]")
combined_df_2.00 <- filter(combined_df, distance_class == "(2,2.25]")
combined_df_2.25 <- filter(combined_df, distance_class == "(2.25,2.5]")
combined_df_2.50 <- filter(combined_df, distance_class == "(2.5,2.75]")
combined_df_2.75 <- filter(combined_df, distance_class == "(2.75,3]")
combined_df_3.00 <- filter(combined_df, distance_class == "(3,3.25]")
combined_df_3.25 <- filter(combined_df, distance_class == "(3.25,3.5]")
combined_df_3.50 <- filter(combined_df, distance_class == "(3.5,3.75]")
combined_df_3.75 <- filter(combined_df, distance_class == "(3.75,4]")

# Boxplot of sold price (in units of $100,000) per year of sale for one
# distance band; `distance_range` is the band as shown in the title, e.g.
# "0.25 to 0.50". Replaces sixteen copy-pasted ggplot() calls.
plot_price_by_year <- function(data, distance_range) {
  ggplot(data, aes(x = Year, y = soldprice / 100000)) +
    geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
    labs(
      title = paste0("Sold Price vs year for townhouses ", distance_range, "km from train station"),
      x = "Year",
      y = "Selling Price (x$100000)"
    ) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

plot_price_by_year(combined_df_0.00, "0 to 0.25")

summary(combined_df_0.00$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 260000 337750 412500 489812 552500 1150000
plot_price_by_year(combined_df_0.25, "0.25 to 0.50")

summary(combined_df_0.25$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 165000 365000 506000 518858 635000 1545000
plot_price_by_year(combined_df_0.50, "0.50 to 0.75")

summary(combined_df_0.50$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 185000 395000 535500 569420 685000 2020000
plot_price_by_year(combined_df_0.75, "0.75 to 1.00")

summary(combined_df_0.75$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 65000 395000 546500 585952 700750 1950000
plot_price_by_year(combined_df_1.00, "1.00 to 1.25")

summary(combined_df_1.00$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 575 380000 481000 546800 660000 1950000
plot_price_by_year(combined_df_1.25, "1.25 to 1.50")

summary(combined_df_1.25$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100000 395000 490000 541084 647498 3000000
plot_price_by_year(combined_df_1.50, "1.50 to 1.75")

summary(combined_df_1.50$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 310000 435000 571500 595094 652750 2090000
plot_price_by_year(combined_df_1.75, "1.75 to 2.00")

summary(combined_df_1.75$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 286000 406000 560000 603858 757000 1515000
plot_price_by_year(combined_df_2.00, "2.00 to 2.25")

summary(combined_df_2.00$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 160000 400000 491000 499950 605000 800000
plot_price_by_year(combined_df_2.25, "2.25 to 2.50")

summary(combined_df_2.25$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 270000 377000 465000 508040 637500 1230000
plot_price_by_year(combined_df_2.50, "2.50 to 2.75")

summary(combined_df_2.50$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
##
plot_price_by_year(combined_df_2.75, "2.75 to 3.00")

summary(combined_df_2.75$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
##
plot_price_by_year(combined_df_3.00, "3.00 to 3.25")

summary(combined_df_3.00$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 305000 484500 710000 706013 773500 1777000
# Title corrected: this subset is the (3.25,3.5] band, not "3.25 to 3.75".
plot_price_by_year(combined_df_3.25, "3.25 to 3.50")

summary(combined_df_3.25$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 555000 555000 555000 555000 555000 555000
plot_price_by_year(combined_df_3.50, "3.50 to 3.75")

summary(combined_df_3.50$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250000 262000 274000 274000 286000 298000
plot_price_by_year(combined_df_3.75, "3.75 to 4.00")

summary(combined_df_3.75$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250000 316000 360000 398110 455250 664000